Rio
listings <- vroom("http://data.insideairbnb.com/brazil/rj/rio-de-janeiro/2020-06-19/data/listings.csv.gz")
glimpse(listings)## Rows: 35,731
## Columns: 106
## $ id <dbl> 17878, 21280, 25026, 3...
## $ listing_url <chr> "https://www.airbnb.co...
## $ scrape_id <dbl> 2.02e+13, 2.02e+13, 2....
## $ last_scraped <date> 2020-06-19, 2020-06-1...
## $ name <chr> "Very Nice 2Br in Copa...
## $ summary <chr> "Discounts for long te...
## $ space <chr> "- Beautiful, sunny 2 ...
## $ description <chr> "Discounts for long te...
## $ experiences_offered <chr> "none", "none", "none"...
## $ neighborhood_overview <chr> "This is the one of th...
## $ notes <chr> NA, NA, "For any stay ...
## $ transit <chr> "Excellent location. C...
## $ access <chr> "The entire apartment ...
## $ interaction <chr> "I will be available t...
## $ house_rules <chr> "Please leave the apar...
## $ thumbnail_url <lgl> NA, NA, NA, NA, NA, NA...
## $ medium_url <lgl> NA, NA, NA, NA, NA, NA...
## $ picture_url <chr> "https://a0.muscache.c...
## $ xl_picture_url <lgl> NA, NA, NA, NA, NA, NA...
## $ host_id <dbl> 68997, 81163, 102840, ...
## $ host_url <chr> "https://www.airbnb.co...
## $ host_name <chr> "Matthias", "Jules", "...
## $ host_since <date> 2010-01-08, 2010-02-1...
## $ host_location <chr> "Rio de Janeiro, State...
## $ host_about <chr> "I am a journalist/w...
## $ host_response_time <chr> "within an hour", "wit...
## $ host_response_rate <chr> "100%", "100%", "70%",...
## $ host_acceptance_rate <chr> "100%", "90%", "77%", ...
## $ host_is_superhost <lgl> TRUE, FALSE, FALSE, TR...
## $ host_thumbnail_url <chr> "https://a0.muscache.c...
## $ host_picture_url <chr> "https://a0.muscache.c...
## $ host_neighbourhood <chr> "Copacabana", "Ipanema...
## $ host_listings_count <dbl> 2, 0, 3, 1, 1, 1, 7, 2...
## $ host_total_listings_count <dbl> 2, 0, 3, 1, 1, 1, 7, 2...
## $ host_verifications <chr> "['email', 'phone', 'r...
## $ host_has_profile_pic <lgl> TRUE, TRUE, TRUE, TRUE...
## $ host_identity_verified <lgl> TRUE, TRUE, TRUE, TRUE...
## $ street <chr> "Rio de Janeiro, Rio d...
## $ neighbourhood <chr> "Copacabana", "Ipanema...
## $ neighbourhood_cleansed <chr> "Copacabana", "Ipanema...
## $ neighbourhood_group_cleansed <lgl> NA, NA, NA, NA, NA, NA...
## $ city <chr> "Rio de Janeiro", "Rio...
## $ state <chr> "Rio de Janeiro", "RJ"...
## $ zipcode <chr> "22020-050", "22420-01...
## $ market <chr> "Rio De Janeiro", "Rio...
## $ smart_location <chr> "Rio de Janeiro, Brazi...
## $ country_code <chr> "BR", "BR", "BR", "BR"...
## $ country <chr> "Brazil", "Brazil", "B...
## $ latitude <dbl> -23.0, -23.0, -23.0, -...
## $ longitude <dbl> -43.2, -43.2, -43.2, -...
## $ is_location_exact <lgl> TRUE, TRUE, TRUE, TRUE...
## $ property_type <chr> "Condominium", "Apartm...
## $ room_type <chr> "Entire home/apt", "En...
## $ accommodates <dbl> 5, 6, 2, 3, 2, 2, 13, ...
## $ bathrooms <dbl> 1.0, 2.0, 1.0, 1.0, 1....
## $ bedrooms <dbl> 2, 2, 1, 1, 1, 1, 6, 1...
## $ beds <dbl> 2, 4, 2, 1, 1, 1, 6, 1...
## $ bed_type <chr> "Real Bed", "Real Bed"...
## $ amenities <chr> "{TV,\"Cable TV\",Inte...
## $ square_feet <dbl> NA, NA, NA, NA, NA, 0,...
## $ price <chr> "$191.00", "$424.00", ...
## $ weekly_price <chr> NA, "$4,945.00", NA, N...
## $ monthly_price <chr> NA, "$17,455.00", NA, ...
## $ security_deposit <chr> "$0.00", "$2,647.00", ...
## $ cleaning_fee <chr> "$250.00", "$265.00", ...
## $ guests_included <dbl> 2, 6, 2, 2, 2, 2, 7, 1...
## $ extra_people <chr> "$0.00", "$0.00", "$45...
## $ minimum_nights <dbl> 7, 5, 7, 2, 2, 3, 2, 3...
## $ maximum_nights <dbl> 180, 30, 60, 1125, 89,...
## $ minimum_minimum_nights <dbl> 7, 5, 7, 2, 2, 3, 2, 3...
## $ maximum_minimum_nights <dbl> 7, 5, 7, 2, 2, 3, 2, 3...
## $ minimum_maximum_nights <dbl> 1125, 30, 60, 1125, 89...
## $ maximum_maximum_nights <dbl> 1125, 30, 60, 1125, 89...
## $ minimum_nights_avg_ntm <dbl> 7.0, 5.0, 7.0, 2.0, 2....
## $ maximum_nights_avg_ntm <dbl> 1125, 30, 60, 1125, 89...
## $ calendar_updated <chr> "4 months ago", "4 mon...
## $ has_availability <lgl> TRUE, TRUE, TRUE, TRUE...
## $ availability_30 <dbl> 0, 19, 0, 0, 0, 23, 27...
## $ availability_60 <dbl> 6, 49, 0, 0, 0, 52, 54...
## $ availability_90 <dbl> 36, 79, 0, 0, 0, 71, 8...
## $ availability_365 <dbl> 287, 93, 154, 0, 170, ...
## $ calendar_last_scraped <date> 2020-06-19, 2020-06-1...
## $ number_of_reviews <dbl> 251, 94, 238, 282, 181...
## $ number_of_reviews_ltm <dbl> 20, 6, 7, 25, 23, 33, ...
## $ first_review <date> 2010-07-15, 2014-02-1...
## $ last_review <date> 2020-04-06, 2020-03-2...
## $ review_scores_rating <dbl> 93, 97, 94, 96, 94, 98...
## $ review_scores_accuracy <dbl> 9, 10, 9, 10, 10, 10, ...
## $ review_scores_cleanliness <dbl> 10, 10, 9, 10, 9, 10, ...
## $ review_scores_checkin <dbl> 10, 10, 9, 10, 10, 10,...
## $ review_scores_communication <dbl> 10, 10, 10, 10, 10, 10...
## $ review_scores_location <dbl> 10, 10, 10, 10, 10, 10...
## $ review_scores_value <dbl> 9, 10, 9, 10, 9, 10, 9...
## $ requires_license <lgl> FALSE, FALSE, FALSE, F...
## $ license <lgl> NA, NA, NA, NA, NA, NA...
## $ jurisdiction_names <lgl> NA, NA, NA, NA, NA, NA...
## $ instant_bookable <lgl> TRUE, FALSE, FALSE, TR...
## $ is_business_travel_ready <lgl> FALSE, FALSE, FALSE, F...
## $ cancellation_policy <chr> "moderate", "strict_14...
## $ require_guest_profile_picture <lgl> FALSE, FALSE, TRUE, FA...
## $ require_guest_phone_verification <lgl> FALSE, FALSE, TRUE, FA...
## $ calculated_host_listings_count <dbl> 1, 1, 3, 1, 1, 1, 5, 1...
## $ calculated_host_listings_count_entire_homes <dbl> 1, 1, 3, 1, 1, 1, 3, 0...
## $ calculated_host_listings_count_private_rooms <dbl> 0, 0, 0, 0, 0, 0, 2, 1...
## $ calculated_host_listings_count_shared_rooms <dbl> 0, 0, 0, 0, 0, 0, 0, 0...
## $ reviews_per_month <dbl> 2.08, 1.22, 1.95, 2.33...
Since some price-related variables are quantitative, we need to make sure it is stored as numeric in the dataframe.
listings <- listings %>%
mutate(price=parse_number(price)) %>%
mutate(cleaning_fee=parse_number(cleaning_fee)) %>%
mutate(extra_people=parse_number(extra_people))
typeof(listings$price)## [1] "double"
## [1] "double"
## [1] "double"
Now that
price,cleaning_fee,extra_peoplevariables are numeric only.
Use
skimr::skim()function to view the summary of the dataframe.
| Name | Piped data |
| Number of rows | 35731 |
| Number of columns | 106 |
| _______________________ | |
| Column type frequency: | |
| character | 43 |
| Date | 5 |
| logical | 16 |
| numeric | 42 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| listing_url | 0 | 1.00 | 34 | 37 | 0 | 35731 | 0 |
| name | 58 | 1.00 | 1 | 255 | 0 | 34488 | 0 |
| summary | 2588 | 0.93 | 1 | 1000 | 0 | 31747 | 0 |
| space | 14186 | 0.60 | 1 | 1000 | 0 | 20709 | 0 |
| description | 1433 | 0.96 | 1 | 1000 | 0 | 33409 | 0 |
| experiences_offered | 0 | 1.00 | 4 | 4 | 0 | 1 | 0 |
| neighborhood_overview | 15776 | 0.56 | 1 | 1000 | 0 | 18303 | 0 |
| notes | 24127 | 0.32 | 1 | 1000 | 0 | 10380 | 0 |
| transit | 15967 | 0.55 | 1 | 1000 | 0 | 18237 | 0 |
| access | 19597 | 0.45 | 1 | 1000 | 0 | 14325 | 0 |
| interaction | 18251 | 0.49 | 1 | 1000 | 0 | 15456 | 0 |
| house_rules | 17304 | 0.52 | 1 | 1000 | 0 | 16288 | 0 |
| picture_url | 0 | 1.00 | 81 | 146 | 0 | 35146 | 0 |
| host_url | 0 | 1.00 | 39 | 43 | 0 | 24992 | 0 |
| host_name | 5 | 1.00 | 1 | 35 | 0 | 6429 | 0 |
| host_location | 168 | 1.00 | 2 | 253 | 0 | 1180 | 0 |
| host_about | 18793 | 0.47 | 1 | 8586 | 0 | 9714 | 22 |
| host_response_time | 5 | 1.00 | 3 | 18 | 0 | 5 | 0 |
| host_response_rate | 5 | 1.00 | 2 | 4 | 0 | 63 | 0 |
| host_acceptance_rate | 5 | 1.00 | 2 | 4 | 0 | 100 | 0 |
| host_thumbnail_url | 5 | 1.00 | 55 | 106 | 0 | 24859 | 0 |
| host_picture_url | 5 | 1.00 | 57 | 109 | 0 | 24859 | 0 |
| host_neighbourhood | 13638 | 0.62 | 3 | 33 | 0 | 164 | 0 |
| host_verifications | 0 | 1.00 | 2 | 158 | 0 | 345 | 0 |
| street | 0 | 1.00 | 10 | 112 | 0 | 502 | 0 |
| neighbourhood | 1892 | 0.95 | 3 | 24 | 0 | 101 | 0 |
| neighbourhood_cleansed | 0 | 1.00 | 3 | 24 | 0 | 156 | 0 |
| city | 162 | 1.00 | 1 | 69 | 0 | 317 | 0 |
| state | 35 | 1.00 | 1 | 51 | 0 | 52 | 0 |
| zipcode | 1752 | 0.95 | 1 | 24 | 0 | 4261 | 1 |
| market | 43 | 1.00 | 4 | 21 | 0 | 11 | 0 |
| smart_location | 0 | 1.00 | 9 | 77 | 0 | 341 | 0 |
| country_code | 0 | 1.00 | 2 | 2 | 0 | 1 | 0 |
| country | 0 | 1.00 | 6 | 6 | 0 | 1 | 0 |
| property_type | 0 | 1.00 | 3 | 22 | 0 | 36 | 0 |
| room_type | 0 | 1.00 | 10 | 15 | 0 | 4 | 0 |
| bed_type | 0 | 1.00 | 5 | 13 | 0 | 5 | 0 |
| amenities | 0 | 1.00 | 2 | 1267 | 0 | 33605 | 0 |
| weekly_price | 33335 | 0.07 | 7 | 11 | 0 | 758 | 0 |
| monthly_price | 33070 | 0.07 | 7 | 11 | 0 | 861 | 0 |
| security_deposit | 15680 | 0.56 | 5 | 10 | 0 | 737 | 0 |
| calendar_updated | 0 | 1.00 | 5 | 14 | 0 | 90 | 0 |
| cancellation_policy | 0 | 1.00 | 6 | 27 | 0 | 6 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| last_scraped | 0 | 1.00 | 2020-06-19 | 2020-06-21 | 2020-06-19 | 3 |
| host_since | 5 | 1.00 | 2009-03-29 | 2020-06-17 | 2016-01-26 | 3301 |
| calendar_last_scraped | 0 | 1.00 | 2020-06-19 | 2020-06-21 | 2020-06-19 | 3 |
| first_review | 14991 | 0.58 | 2010-06-07 | 2020-06-18 | 2018-02-14 | 2506 |
| last_review | 14991 | 0.58 | 2012-02-21 | 2020-06-19 | 2020-01-31 | 1529 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| thumbnail_url | 35731 | 0 | NaN | : |
| medium_url | 35731 | 0 | NaN | : |
| xl_picture_url | 35731 | 0 | NaN | : |
| host_is_superhost | 5 | 1 | 0.13 | FAL: 31028, TRU: 4698 |
| host_has_profile_pic | 5 | 1 | 1.00 | TRU: 35588, FAL: 138 |
| host_identity_verified | 5 | 1 | 0.27 | FAL: 26186, TRU: 9540 |
| neighbourhood_group_cleansed | 35731 | 0 | NaN | : |
| is_location_exact | 0 | 1 | 0.69 | TRU: 24496, FAL: 11235 |
| has_availability | 0 | 1 | 1.00 | TRU: 35731 |
| requires_license | 0 | 1 | 0.00 | FAL: 35731 |
| license | 35731 | 0 | NaN | : |
| jurisdiction_names | 35731 | 0 | NaN | : |
| instant_bookable | 0 | 1 | 0.40 | FAL: 21360, TRU: 14371 |
| is_business_travel_ready | 0 | 1 | 0.00 | FAL: 35731 |
| require_guest_profile_picture | 0 | 1 | 0.02 | FAL: 35176, TRU: 555 |
| require_guest_phone_verification | 0 | 1 | 0.02 | FAL: 35177, TRU: 554 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 2.13e+07 | 1.37e+07 | 1.79e+04 | 1.10e+07 | 1.71e+07 | 3.43e+07 | 4.38e+07 | ▅▇▃▃▆ |
| scrape_id | 0 | 1.00 | 2.02e+13 | 0.00e+00 | 2.02e+13 | 2.02e+13 | 2.02e+13 | 2.02e+13 | 2.02e+13 | ▁▁▇▁▁ |
| host_id | 0 | 1.00 | 8.50e+07 | 8.94e+07 | 1.17e+04 | 1.50e+07 | 5.62e+07 | 1.12e+08 | 3.51e+08 | ▇▃▁▁▁ |
| host_listings_count | 5 | 1.00 | 3.58e+01 | 3.12e+02 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 3.00e+00 | 3.32e+03 | ▇▁▁▁▁ |
| host_total_listings_count | 5 | 1.00 | 3.58e+01 | 3.12e+02 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 3.00e+00 | 3.32e+03 | ▇▁▁▁▁ |
| latitude | 0 | 1.00 | -2.30e+01 | 3.00e-02 | -2.31e+01 | -2.30e+01 | -2.30e+01 | -2.29e+01 | -2.28e+01 | ▁▇▃▁▁ |
| longitude | 0 | 1.00 | -4.32e+01 | 1.00e-01 | -4.37e+01 | -4.33e+01 | -4.32e+01 | -4.32e+01 | -4.31e+01 | ▁▁▂▂▇ |
| accommodates | 0 | 1.00 | 4.18e+00 | 2.63e+00 | 1.00e+00 | 2.00e+00 | 4.00e+00 | 5.00e+00 | 1.60e+02 | ▇▁▁▁▁ |
| bathrooms | 67 | 1.00 | 1.67e+00 | 1.05e+00 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 2.00e+01 | ▇▁▁▁▁ |
| bedrooms | 79 | 1.00 | 1.62e+00 | 1.09e+00 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 3.00e+01 | ▇▁▁▁▁ |
| beds | 331 | 0.99 | 2.54e+00 | 2.08e+00 | 0.00e+00 | 1.00e+00 | 2.00e+00 | 3.00e+00 | 6.20e+01 | ▇▁▁▁▁ |
| square_feet | 35279 | 0.01 | 5.25e+02 | 9.83e+02 | 0.00e+00 | 0.00e+00 | 1.94e+02 | 7.53e+02 | 1.08e+04 | ▇▁▁▁▁ |
| price | 0 | 1.00 | 7.86e+02 | 2.38e+03 | 0.00e+00 | 1.55e+02 | 3.00e+02 | 6.42e+02 | 1.32e+05 | ▇▁▁▁▁ |
| cleaning_fee | 11584 | 0.68 | 1.64e+02 | 1.94e+02 | 0.00e+00 | 8.90e+01 | 1.50e+02 | 2.00e+02 | 7.41e+03 | ▇▁▁▁▁ |
| guests_included | 0 | 1.00 | 1.73e+00 | 1.60e+00 | 1.00e+00 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 3.60e+01 | ▇▁▁▁▁ |
| extra_people | 0 | 1.00 | 4.24e+01 | 1.11e+02 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 5.00e+01 | 5.55e+03 | ▇▁▁▁▁ |
| minimum_nights | 0 | 1.00 | 4.91e+00 | 2.23e+01 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 4.00e+00 | 1.12e+03 | ▇▁▁▁▁ |
| maximum_nights | 0 | 1.00 | 2.87e+04 | 5.29e+06 | 1.00e+00 | 3.00e+01 | 1.12e+03 | 1.12e+03 | 1.00e+09 | ▇▁▁▁▁ |
| minimum_minimum_nights | 0 | 1.00 | 4.78e+00 | 2.12e+01 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 4.00e+00 | 1.10e+03 | ▇▁▁▁▁ |
| maximum_minimum_nights | 0 | 1.00 | 5.12e+00 | 2.14e+01 | 1.00e+00 | 1.00e+00 | 3.00e+00 | 5.00e+00 | 1.10e+03 | ▇▁▁▁▁ |
| minimum_maximum_nights | 0 | 1.00 | 2.87e+04 | 5.29e+06 | 1.00e+00 | 4.00e+01 | 1.12e+03 | 1.12e+03 | 1.00e+09 | ▇▁▁▁▁ |
| maximum_maximum_nights | 0 | 1.00 | 2.87e+04 | 5.29e+06 | 1.00e+00 | 4.40e+01 | 1.12e+03 | 1.12e+03 | 1.00e+09 | ▇▁▁▁▁ |
| minimum_nights_avg_ntm | 0 | 1.00 | 4.89e+00 | 2.13e+01 | 1.00e+00 | 1.00e+00 | 2.10e+00 | 4.00e+00 | 1.10e+03 | ▇▁▁▁▁ |
| maximum_nights_avg_ntm | 0 | 1.00 | 2.87e+04 | 5.29e+06 | 1.00e+00 | 4.35e+01 | 1.12e+03 | 1.12e+03 | 1.00e+09 | ▇▁▁▁▁ |
| availability_30 | 0 | 1.00 | 1.65e+01 | 1.39e+01 | 0.00e+00 | 0.00e+00 | 2.70e+01 | 3.00e+01 | 3.00e+01 | ▆▁▁▁▇ |
| availability_60 | 0 | 1.00 | 3.44e+01 | 2.79e+01 | 0.00e+00 | 0.00e+00 | 5.40e+01 | 5.90e+01 | 6.00e+01 | ▆▁▁▁▇ |
| availability_90 | 0 | 1.00 | 5.25e+01 | 4.18e+01 | 0.00e+00 | 0.00e+00 | 8.30e+01 | 8.90e+01 | 9.00e+01 | ▆▁▁▁▇ |
| availability_365 | 0 | 1.00 | 1.72e+02 | 1.55e+02 | 0.00e+00 | 0.00e+00 | 1.68e+02 | 3.62e+02 | 3.65e+02 | ▇▃▂▁▇ |
| number_of_reviews | 0 | 1.00 | 9.63e+00 | 2.56e+01 | 0.00e+00 | 0.00e+00 | 1.00e+00 | 6.00e+00 | 4.06e+02 | ▇▁▁▁▁ |
| number_of_reviews_ltm | 0 | 1.00 | 3.06e+00 | 6.93e+00 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 2.00e+00 | 1.05e+02 | ▇▁▁▁▁ |
| review_scores_rating | 15873 | 0.56 | 9.46e+01 | 9.58e+00 | 2.00e+01 | 9.30e+01 | 9.80e+01 | 1.00e+02 | 1.00e+02 | ▁▁▁▁▇ |
| review_scores_accuracy | 15892 | 0.56 | 9.64e+00 | 9.20e-01 | 2.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_cleanliness | 15887 | 0.56 | 9.39e+00 | 1.13e+00 | 2.00e+00 | 9.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_checkin | 15893 | 0.56 | 9.81e+00 | 7.20e-01 | 2.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_communication | 15885 | 0.56 | 9.76e+00 | 8.10e-01 | 2.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_location | 15891 | 0.56 | 9.75e+00 | 7.40e-01 | 2.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_value | 15889 | 0.56 | 9.29e+00 | 1.06e+00 | 2.00e+00 | 9.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| calculated_host_listings_count | 0 | 1.00 | 8.07e+00 | 3.42e+01 | 1.00e+00 | 1.00e+00 | 1.00e+00 | 3.00e+00 | 3.19e+02 | ▇▁▁▁▁ |
| calculated_host_listings_count_entire_homes | 0 | 1.00 | 7.20e+00 | 3.39e+01 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 3.17e+02 | ▇▁▁▁▁ |
| calculated_host_listings_count_private_rooms | 0 | 1.00 | 7.50e-01 | 1.59e+00 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 1.00e+00 | 2.00e+01 | ▇▁▁▁▁ |
| calculated_host_listings_count_shared_rooms | 0 | 1.00 | 9.00e-02 | 6.50e-01 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 1.10e+01 | ▇▁▁▁▁ |
| reviews_per_month | 14991 | 0.58 | 5.50e-01 | 7.30e-01 | 1.00e-02 | 1.10e-01 | 2.60e-01 | 6.80e-01 | 8.55e+00 | ▇▁▁▁▁ |
There are 106 variables and 35,731 observations.
The cleaning fee numeric variable has 11,584 missing observations, however this is not an issue as some hosts prefer including the cleaning fee in their listed price rather than displaying it separately. In fact, we think that AirBnb clients feel like they are striking a happy medium when they don’t see any separate price to the main listed price. Moreover, clients could well be cleaner and more respectful of the accommodation when they don’t see that they are paying for a cleaning fee.
Other variables such as Access, Transit or House_rules also have more than 10,000 missing values. However once again, this is probably due to the host discretionary marketing decision, and these instructions are probably either obvious or included in other parts of the listing information rather than randomly missing.
Next, replace the missing values of cleaning_fee with an numeric 0.
listings <- listings %>%
mutate(cleaning_fee = case_when(
is.na(cleaning_fee) ~ 0,
TRUE ~ cleaning_fee
))
missing = listings %>% skimr::skim() %>% filter(skim_variable == 'cleaning_fee') %>% select(n_missing)
missing$n_missing## [1] 0
Now there are no longer any missing values of cleaning_fee.
Next, we look at the variable
property_typeand use thecount()function to determine how many categories there are and their frequency. Specifically, we like to know:
property_type <- listings %>%
group_by(property_type) %>%
summarise(n=n()) %>%
arrange(desc(n)) %>%
mutate(property_type, `Percent (%)`=round((n / nrow(listings)) * 100, 1))
property_typeCounting the different property types shows that the top 4 are Apartment, House, Condominium and Loft.
The top 4 property types thus account for 94.6% of the total listings, hence our decision to assign the other property types to the ‘Other’ category.
listings <- listings %>%
mutate(prop_type_simplified = case_when(
property_type %in% c("Apartment","House", "Condominium","Loft") ~ property_type,
TRUE ~ "Other"))
prop_type_simplified <- listings %>%
group_by(prop_type_simplified ) %>%
summarise(n=n()) %>%
arrange(desc(n)) %>%
mutate(prop_type_simplified, `Percent (%)`=round((n / nrow(listings)) * 100, 1))
prop_type_simplifiedAirbnb is most commonly used for travel purposes, i.e., as an alternative to traditional hotels. We only want to include listings in our regression analysis that are intended for travel purposes:
min_stay <- listings %>%
count(minimum_nights) %>%
arrange(desc(n))
min_stay <- mutate(min_stay, `Percent (%)`=round((n / nrow(listings)) * 100, 3))
min_stayThe most common value for minimum stay requirement is 1 night, with 2, 3, 5 and 4 nights following closely.
Some values seem very high, such as 1123 or 930, as they span longer than one year. This could be due to listing errors, scams or simply the host having put its flat on hold for an undefined period of time as it is unexpected for a guest to book a flat for such a long period of time.
For later analysis, we shall filter the airbnb data so that it only includes observations with minimum_nights <= 4.
An overview of the spatial distribution of AirBnB rentals whose minimum_nights is less than equal to four (4).
Create a new variable called price_4_nights that uses price, cleaning_fee, guests_included, and extra_people to calculate the total cost for two people to stay at the Airbnb property for 4 nights. This is the variable we want to explain.
In addition, we filter out those accommodating less than 2 guests, minimum_nights more than 4 nights, maximum_nights less than 4.
listings <- listings %>%
filter(accommodates >=2 & minimum_nights <=4 & maximum_nights >= 4 & guests_included <= 2 & !is.na(number_of_reviews) & !is.na(review_scores_rating)) %>%
mutate(price_4_nights=ifelse(guests_included==2, 4*(price+cleaning_fee), 4*(price+cleaning_fee+extra_people))) %>%
mutate(price_4_nights_log=log10(price_4_nights))Use histograms or density plots to examine the distributions of
price_4_nightsandlog(price_4_nights).
g1=ggplot(listings, aes(price_4_nights, fill=prop_type_simplified)) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + ggtitle("Density of Cost") +
geom_histogram(stat="density")
g2=ggplot(listings, aes(price_4_nights, fill=prop_type_simplified)) +
scale_x_continuous(breaks=c(50,100,200,300,500,1000,3000,8000,20000,40000), trans="log10") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + ggtitle("Density of Cost (log10)") +
geom_histogram(stat="density")
plot_list <- list(g1,g2)
ggm_density <- ggmatrix(
plots = plot_list,
nrow = 1, ncol = 2,
xAxisLabels = c("price_4_nights", "log10(price_4_nights)"),
yAxisLabels = c("Density"),
byrow = FALSE,
title = "",
legend = 1
)
ggm_densityAs we can see that
price_4_nightsis extremely skewed to left side, due to some outliers with extremely high prices.
Even though using
log(price_4_nights)removes the skew issue, it will also remove many details in analysis.
Since the skew issue is due to extreme outliers, we should be able to take 95% quantile to remove the outliers.
Next, do charting again with new data set.
price_max = quantile(listings$price_4_nights, c(0.95))[[1]]
listings <- listings %>%
filter(price_4_nights <= price_max)
ggplot(listings, aes(price_4_nights, fill=prop_type_simplified)) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + ggtitle("Density of Cost") +
geom_histogram(stat="density")As we can see from the above chart, quantile of 95% will give much better distribution. We will use the adjusted data set for analysis below.
Per project assignment, we fit a regression model called model1 with the following explanatory variables:
prop_type_simplified,number_of_reviews, andreview_scores_rating.
model1 <- lm(price_4_nights ~ prop_type_simplified + number_of_reviews + review_scores_rating, data=listings)
model1 %>% broom::tidy()From the summary:
prop_type_simplified is a significant predictor to price_4_nights. Apartment are most expensive, followed by House and Condominium; This is in line with our experience and expectation;
On the other hands, review_scores_rating has little relationship to price_4_nights;
Lastly, number_of_reviews has even less relationship to price_4_nights.
room_type)We want to determine if
room_typeis a significant predictor of the cost for 4 nights, given everything else in the model.
Now Create a new regression model called model2 that includes all of the explanantory variables in model1 plus
room_type.
model2 <- lm(price_4_nights ~ prop_type_simplified + number_of_reviews + review_scores_rating + room_type, data = listings)
model2 %>% broom::tidy()From the summary, we can tell:
room_type is a significant predictor of price_4_nights. Entire home/appt are most expensive, followed by Hotel room, Private room, and Shared room is cheapest.We can use
ggpairs2to get more insight about these variables related to each other.
ggpairs2 <- listings %>%
select(price_4_nights, number_of_reviews, review_scores_rating, prop_type_simplified, room_type) %>%
GGally::ggpairs(mapping = ggplot2::aes(colour=room_type), legend = 1) +
theme(legend.position = "bottom")
ggpairs2From the chart, we can tell:
review_scores_rating has a low coefficient -0.101 to price_4_nights; this is consistent to the model summary above. As we can see from the chart, listings within low to medium price range receive most review score feedback and more higher rating than rentals with high prices; However, it could be a result of higher consumer expectation, as the higher price they paid, the higher quality they would expect from the rentals.
number_of_reviews, given a very low coefficient 0.013, has even less relationship to price_4_nights. The distribution of number_of_reviews looks similar to that of price_4_nights. So it seems more a result of the number of visited customers.
Hotel room has least count; this mostly because of the nature of Airbnb’s business, which is more for private properties, and less for hotels.
Run car::vif() to check about colinearity
## GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.01 4 1.00
## number_of_reviews 1.01 1 1.01
## review_scores_rating 1.00 1 1.00
## GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.23 4 1.03
## number_of_reviews 1.02 1 1.01
## review_scores_rating 1.01 1 1.00
## room_type 1.24 3 1.04
For both models the variables are very little correlated.
Run huxtable::huxreg to compare the two models
#install.packages("huxreg")
huxtable::huxreg(model1, model2,
statistics = c('#observations' = 'nobs',
'R squared' = 'r.squared',
'Adj. R Squared' = 'adj.r.squared',
'Residual SE' = 'sigma'),
bold_signif = 0.05
) %>%
huxtable::set_caption('Comparison of models')| (1) | (2) | |
|---|---|---|
| (Intercept) | 1428.616 *** | 1495.232 *** |
| (78.514) | (72.898) | |
| prop_type_simplifiedCondominium | 29.318 | 47.740 |
| (31.758) | (29.478) | |
| prop_type_simplifiedHouse | -481.647 *** | -141.544 *** |
| (29.101) | (28.154) | |
| prop_type_simplifiedLoft | -144.891 ** | -299.894 *** |
| (46.391) | (43.194) | |
| prop_type_simplifiedOther | -313.479 *** | -171.544 *** |
| (31.286) | (30.664) | |
| number_of_reviews | -2.767 *** | -3.437 *** |
| (0.229) | (0.213) | |
| review_scores_rating | 1.470 | 3.078 *** |
| (0.824) | (0.766) | |
| room_typeHotel room | -551.811 *** | |
| (92.440) | ||
| room_typePrivate room | -704.118 *** | |
| (16.315) | ||
| room_typeShared room | -807.047 *** | |
| (57.379) | ||
| #observations | 12016 | 12016 |
| R squared | 0.039 | 0.173 |
| Adj. R Squared | 0.039 | 0.172 |
| Residual SE | 852.204 | 790.807 |
| *** p < 0.001; ** p < 0.01; * p < 0.05. | ||
model2is the better model. We use broom::tidy() and broom::glance() to get more about the model.
## # A tibble: 10 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1495. 72.9 20.5 6.37e-92
## 2 prop_type_simplifiedCondominium 47.7 29.5 1.62 1.05e- 1
## 3 prop_type_simplifiedHouse -142. 28.2 -5.03 5.04e- 7
## 4 prop_type_simplifiedLoft -300. 43.2 -6.94 4.04e-12
## 5 prop_type_simplifiedOther -172. 30.7 -5.59 2.26e- 8
## 6 number_of_reviews -3.44 0.213 -16.1 5.02e-58
## 7 review_scores_rating 3.08 0.766 4.02 5.87e- 5
## 8 room_typeHotel room -552. 92.4 -5.97 2.45e- 9
## 9 room_typePrivate room -704. 16.3 -43.2 0.
## 10 room_typeShared room -807. 57.4 -14.1 1.40e-44
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.173 0.172 791. 279. 0 9 -97228. 1.94e5 1.95e5
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
The equation is:
price_4_nights = 1495.23 +
47.74*(prop_type_simplifiedCondominium) +
-141.54*(prop_type_simplifiedHouse) +
-299.89*(prop_type_simplifiedLoft) +
-171.54*(prop_type_simplifiedOther) +
-3.44*(number_of_reviews) +
3.08*(review_scores_rating) +
-551.81(room_typeHotel room) +
-704.12*(room_typePrivate room) +
-807.05*(room_typeShared room)
Add real test here!
We want to know which variable should we use for the regression model? and why?
Now we create correlation matrix for as many variables as possible, in order to find out those matter most w.r.t
price_4_nights.
We then figure how are AirBnB prices distributed with those factors, decide which variable should you use for the regression model.
Some interested variables are in character format, so we change them to numeric for analysis later.
listings <- listings %>%
mutate(host_response_rate=parse_number(host_response_rate)) %>%
mutate(host_acceptance_rate=parse_number(host_acceptance_rate)) %>%
mutate(security_deposit=parse_number(security_deposit))# select interested variables
corr_listings <- listings %>%
filter(!is.na(host_response_rate) & !is.na(host_acceptance_rate)) %>%
select(price_4_nights,
host_response_rate,
host_acceptance_rate,
host_listings_count,
accommodates,
bedrooms,
beds,
bathrooms,
guests_included,
number_of_reviews,
number_of_reviews_ltm,
reviews_per_month,
review_scores_rating,
review_scores_checkin,
review_scores_cleanliness,
review_scores_accuracy,
review_scores_communication,
review_scores_location,
review_scores_value
)
names(corr_listings)## [1] "price_4_nights" "host_response_rate"
## [3] "host_acceptance_rate" "host_listings_count"
## [5] "accommodates" "bedrooms"
## [7] "beds" "bathrooms"
## [9] "guests_included" "number_of_reviews"
## [11] "number_of_reviews_ltm" "reviews_per_month"
## [13] "review_scores_rating" "review_scores_checkin"
## [15] "review_scores_cleanliness" "review_scores_accuracy"
## [17] "review_scores_communication" "review_scores_location"
## [19] "review_scores_value"
What we can tell from the matrix above:
accommodates, bedrooms, bathrooms, beds are the most significant positive predictors of price_4_nights;host_listings_count has some possible impact to price_4_nights but less significant;reviews_per_month, number_of_reviews, number_of_reviews_ltm negative related to price_4_nights;review_scores_rating has insignificant impact to price_4_nights, which quite contraindicate common sense;accommodate) significant predictors of price_4_nights?Yes
We decide on choosing a few explanatory variables that have low numbers of
n_uniquevalues for our factor variables:
neighbourhoodProperty_typeothers that are likely to influence price:
host_is_superhostReview_scores_cleanlinessReview_scores_locationminimum_nightssquare_feetbedroomsbathroomsNow we will analyze how neighbourhood related to price.
neighbourhoodLooking at the map, properties of outskirts of Rio seems reduce much.
neighbour_list <- listings %>%
filter(!is.na(neighbourhood)) %>%
group_by(neighbourhood) %>%
summarise(n=n(), prop_mean_price=mean(price_4_nights)) %>%
arrange(desc(n)) %>%
mutate(`Percent (%)`=round((n / nrow(listings)) * 100, 1))
neighbour_list## # A tibble: 78 x 4
## neighbourhood n prop_mean_price `Percent (%)`
## <chr> <int> <dbl> <dbl>
## 1 Copacabana 3540 1475. 29.5
## 2 Barra da Tijuca 1812 1626. 15.1
## 3 Ipanema 1103 1910. 9.2
## 4 Botafogo 661 1291. 5.5
## 5 Leblon 567 2026. 4.7
## 6 Santa Teresa 483 1076. 4
## 7 Recreio dos Bandeirantes 480 1499. 4
## 8 Flamengo 357 1245. 3
## 9 Centro 239 1008. 2
## 10 Leme 238 1433. 2
## # ... with 68 more rows
After removing missing observations and keeping only the 13 main neighbourhoods of Rio, namely the ones displaying more than 200 Air Bnb listings, we get a sample of 10,694 listings from which to start our analysis. This corresponds to a proportion of around 85% of the population for which there are no NA values and is therefore a good basis for our analysis.
main_neighbourhood <- listings %>% group_by(neighbourhood) %>% tally() %>% filter(!is.na(neighbourhood) & n > 200)
main_neighbourhood## # A tibble: 12 x 2
## neighbourhood n
## <chr> <int>
## 1 Barra da Tijuca 1812
## 2 Botafogo 661
## 3 Centro 239
## 4 Copacabana 3540
## 5 Flamengo 357
## 6 Ipanema 1103
## 7 Lapa 222
## 8 Laranjeiras 235
## 9 Leblon 567
## 10 Leme 238
## 11 Recreio dos Bandeirantes 480
## 12 Santa Teresa 483
## [1] 9937
# filter main neighbourhoods
listings_nb <- listings %>% filter(neighbourhood %in% c(main_neighbourhood$neighbourhood))
leaflet(data = listings_nb) %>%
addProviderTiles("OpenStreetMap.Mapnik") %>%
addCircleMarkers(lng = ~longitude,
lat = ~latitude,
radius = 1,
fillColor = "blue",
fillOpacity = 0.4,
popup = ~listing_url,
label = ~property_type)ggplot(listings_nb, aes(x = price_4_nights, fill = neighbourhood )) +
ggtitle("Count of Price by neighbourhood") +
geom_histogram(alpha = 0.5, position = "identity") ggplot(listings_nb, aes(neighbourhood, price_4_nights, fill = neighbourhood)) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ggtitle("Price Span per Neighbourhood") +
geom_boxplot()
Copacabanahas largest count of properties and widest price span; whileIpanemaandLeblonhave higher mean prices.
These are quite in line with our impression, as the three area are most popular for tourists.